# -*- coding:utf8 -*-
# !/usr/bin/env python

"""
#全国企业信用信息公示系统（青海）
#维护黄羽
"""

import re
import urllib2
from bs4 import BeautifulSoup
from utils import kill_captcha
# from scpy.request_util import *
from request_util import *
from parse_util.parse_basesic import parse_basesic
from scpy.logger import get_logger
from table import *
from get_page import *
import traceback


logger = get_logger(__file__)


def downloadImgAnd_kill(companyName, province):
    '''
    Download the captcha image, crack it, then search for the company.

    :param companyName: company name or registration-number keyword
    :param province: province code; only 'qh' (Qinghai) is accepted
    :return: the list of <div class="list"> search-result tags when the
             captcha was cracked and the company exists;
             None when the input is invalid or the company does not exist;
             '' (empty string) when the cracked captcha was wrong or the
             download failed, so the caller can retry.
    :raises: re-raises any exception from the captcha download, the cracking
             service, or the search request itself.
    '''
    # proxy_conf={'ip':'192.168.31.121','port':27017,'db':'crawler_proxy','collection':'proxy'}
    # Validate the input before doing any network work.
    if province != "qh" or not companyName:
        logger.error('输入的省份错误或公司不存在,你当前输入为,省份：%s,公司或关键字：%s' % (province, companyName))
        return None

    # --- Step 1: download the captcha image --------------------------------
    imgUrl_str = r'http://218.95.241.36:9080/captchaVal.jspx'
    getCaptchaHeaders = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': '218.95.241.36:9080',
        'Referer': 'http://218.95.241.36:9080/search.jspx',
    }
    getCaptcha_request_util = RequestUtil(getCaptchaHeaders)  # no proxy ip
    try:
        imgBin = getCaptcha_request_util.make_request(imgUrl_str).content
    except Exception as e:
        logger.info(e)
        raise  # bare raise preserves the original traceback

    # An empty body means the captcha download failed; return '' so the
    # caller retries the whole download/crack cycle.
    if not imgBin:
        return ''

    # --- Step 2: crack the captcha -----------------------------------------
    sourceStr = province
    imgformat = "jpg"
    try:
        res_Code = kill_captcha(imgBin, sourceStr, imgformat)
    except Exception as e:
        logger.info("破解验证码的服务，出现异常")
        logger.info(e)
        raise

    # A missing / over-long / placeholder answer means the cracking service
    # failed; return '' so the caller retries.
    if not res_Code or len(res_Code) > 100 or str(res_Code) in ['None', 'wrong']:
        logger.info("破解验证码的服务，出现异常,可能是下载的验证码错误，也可能破解服务出现异常")
        logger.info("res_Code:%s" % res_Code)
        return ''
    logger.info('验证码为:%s' % res_Code)

    # --- Step 3: submit the search form with the cracked captcha -----------
    searchCompanyRegInfoHeaders = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Length': '143',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': '218.95.241.36:9080',
        'Origin': 'http://218.95.241.36:9080',
        'Referer': 'http://218.95.241.36:9080/search.jspx',
        'Upgrade-Insecure-Requests': '1',
    }

    searchCompanyRegInfo_request_util = RequestUtil(searchCompanyRegInfoHeaders)  # no proxy ip

    searchCompanyRegInfo_dict = {
        'entName': companyName,
        'checkNo': res_Code,
    }
    searchCompanyRegInfo_url_str = 'http://218.95.241.36:9080/searchList.jspx'
    try:
        searchCompanyRegInfo_res = searchCompanyRegInfo_request_util.make_request(searchCompanyRegInfo_url_str,
                                                                                  method='post',
                                                                                  data=searchCompanyRegInfo_dict).content
    except Exception as e:
        logger.error(e)
        raise
    searchCompanyRegInfo_soup = BeautifulSoup(searchCompanyRegInfo_res, 'html5lib')

    # The site answers '验证码不正确或已失效！' when the cracked captcha was
    # rejected; return '' so the caller retries with a fresh captcha.
    if re.compile('验证码不正确或已失效！').findall(str(searchCompanyRegInfo_soup)):
        return ''

    searchCompany = searchCompanyRegInfo_soup.find_all('div', {'class': 'list'})
    if searchCompany:
        logger.info("搜索的公司存在！")
        return searchCompany
    logger.info("搜索的公司不存在")
    return None  # the searched company does not exist


def getCompanyInfo(searchCompany):
    '''
    Download the company detail pages and annual reports, and parse them.

    :param searchCompany: the search-result tags returned by
                          downloadImgAnd_kill (the <div class="list"> nodes)
    :return: a (raw_html_dict, companyInfo_dict) tuple — the raw html pages
             plus the parsed company dict; None when searchCompany is empty.
    :raises: re-raises any exception from the page downloads.
    '''
    from table import td_clean
    from table import money_notclean

    if not searchCompany:
        return None

    companyInfo_dict = {}
    companyYearReport_list = []
    raw_html_dict = {}
    raw_base_html_dict = {}
    # Sections this site does not provide (or that are parsed later) are
    # initialised to empty lists so the output schema stays uniform.
    companyInfo_dict['punishBreakList'] = []
    companyInfo_dict['punishedList'] = []
    companyInfo_dict['alidebtList'] = []

    companyInfo_dict['entinvItemList'] = []

    companyInfo_dict['frinvList'] = []
    companyInfo_dict['frPositionList'] = []
    companyInfo_dict['filiationList'] = []
    companyInfo_dict['caseInfoList'] = []
    companyInfo_dict['sharesFrostList'] = []
    companyInfo_dict['sharesImpawnList'] = []
    companyInfo_dict['morDetailList'] = []
    companyInfo_dict['morguaInfoList'] = []
    companyInfo_dict['liquidationList'] = []

    rootUrl_str = 'http://218.95.241.36:9080'

    # Take the first search hit; its href query string carries the entity
    # id used by all the per-section pagination endpoints below.
    aCompany_url = searchCompany[0].find_all('a')[0].get('href')
    entId = str(re.compile(r'.*=(.+)').findall(aCompany_url)[0])
    aCompanyInfo_url = rootUrl_str + aCompany_url

    # Headers for the company basic-information page.
    getCompanyInfoUrlHeaders = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '218.95.241.36:9080',
        'Upgrade-Insecure-Requests': '1',
    }
    logger.info("开始获取公司基本信息！")
    getCompanyInfoUrl_request_util = RequestUtil(getCompanyInfoUrlHeaders)  # no proxy ip
    try:
        getCompanyInfoUrl_res = getCompanyInfoUrl_request_util.make_request(aCompanyInfo_url, method='get').content
    except Exception as e:
        logger.error(e)
        raise
    raw_base_html_dict['base'] = getCompanyInfoUrl_res

    logger.info("开始解析公司基本信息！")
    getCompanyInfoUrl_soup = BeautifulSoup(getCompanyInfoUrl_res, 'html5lib')

    jibenxinxi = getCompanyInfoUrl_soup.find_all(id='jibenxinxi')[0]

    # Registration info: the first table of #jibenxinxi holds the basics.
    jibenxinxi_table_list = jibenxinxi.find_all('table')
    basic_list = jibenxinxi_table_list[0]
    # basic_list = parse_basesic(basic_list)
    basic_list = index('基本信息', str(basic_list))

    if basic_list and isinstance(basic_list, list):
        companyInfo_dict['basicList'] = basic_list
    else:
        companyInfo_dict['basicList'] = []

    # --- Shareholder information -------------------------------------------
    logger.info("开始获取公司股东信息！")
    shareHolderList = []
    invDiv = jibenxinxi.find_all('div', attrs={'id': 'invDiv'})
    invPagination = jibenxinxi.find('div', attrs={'id': 'invPagination'})
    if invPagination and invPagination.find_all('a'):
        share_holder_total_page = get_page(invPagination)
        if share_holder_total_page > 1:
            # Fetch pages 2..N and append their tables to invDiv so the
            # parse loop below covers every page.
            share_holder_page_num = range(2, share_holder_total_page + 1)
            for page in share_holder_page_num:
                share_holder_page_url = 'http://218.95.241.36:9080/QueryInvList.jspx?'
                mainId = entId
                share_holder_page_data = {'pno': page, 'mainId': mainId}
                share_holder_list_page = getCompanyInfoUrl_request_util.make_request(share_holder_page_url,
                                                                                     data=share_holder_page_data,
                                                                                     method='get').content
                share_holder_list_soup = BeautifulSoup(share_holder_list_page, 'html5lib')
                share_holder_list_table = share_holder_list_soup.find_all('table')
                if share_holder_list_table:
                    invDiv.append(share_holder_list_table[0])

    raw_share_holder_html_list = []
    logger.info("开始解析公司股东信息！")
    for ainvDiv_table in invDiv:
        invDiv_tr_tag = ainvDiv_table.find_all('tr')
        for a_invDiv_tr in invDiv_tr_tag:
            share_holder_dict = {}
            a_invDiv_td = re.findall('<td>(.*?)</td>', td_clean(str(a_invDiv_tr)))
            share_holder_dict['shareholderType'] = a_invDiv_td[0]

            # The row's onclick carries the detail-page id.
            queryInvDetailAction_id = re.findall(".*id=(.*).*\'\)", str(a_invDiv_tr))
            if queryInvDetailAction_id:
                queryInvDetailAction_id = queryInvDetailAction_id[0]
                share_holder_detail_url = 'http://218.95.241.36:9080/queryInvDetailAction.jspx?id=' + queryInvDetailAction_id
                try:
                    share_holder_detail_content = getCompanyInfoUrl_request_util.make_request(share_holder_detail_url,
                                                                                              method='get').content
                except Exception as e:
                    logger.error(e)
                    raise
                share_holder_detail_soup = BeautifulSoup(share_holder_detail_content, 'html5lib')
                # NOTE(review): {'class', 'detailsList'} is a set literal —
                # bs4 treats a non-dict attrs value as a class filter, so it
                # matches class="detailsList"; {'class': 'detailsList'} would
                # be the explicit form. Kept as-is to preserve behavior.
                table_tag = share_holder_detail_soup.find_all('table', attrs={'class', 'detailsList'})[0]
                tr_tag_list = table_tag.find_all('tr')
                if len(tr_tag_list) > 3:
                    # Row 3 of the detail table: name / amount / ... / currency / date.
                    tr_tag = tr_tag_list[3]
                    td_tag = td_clean(str(tr_tag)).replace('<tr>', '').replace('</tr>', '')
                    td_res_list = re.findall("<td>(.*?)</td>", td_tag)
                    if td_res_list[0]:
                        share_holder_dict['shareholderName'] = td_res_list[0]
                    else:
                        share_holder_dict['shareholderName'] = ''
                    if td_res_list[1]:
                        share_holder_dict['subConam'] = money_notclean(td_res_list[1])
                    if td_res_list[4]:
                        # Strip the '万' (ten-thousand) amount prefix to keep
                        # only the currency name.
                        if '万' in td_res_list[4]:
                            share_holder_dict['regCapCur'] = re.findall('.*万(.*)', td_res_list[4])[0]
                        else:
                            share_holder_dict['regCapCur'] = re.findall('\d*(.*)', td_res_list[4])[0]
                    if td_res_list[5]:
                        share_holder_dict['conDate'] = parse_time(td_res_list[5])
                    else:
                        share_holder_dict['conDate'] = ''
                    share_holder_dict['fundedRatio'] = ''
                    share_holder_dict['country'] = ''
                    shareHolderList.append(share_holder_dict)

                    raw_share_holder_html_list.append(share_holder_detail_content)

    companyInfo_dict['shareHolderList'] = shareHolderList
    raw_base_html_dict['shareHolder'] = raw_share_holder_html_list

    # --- Alteration (change) records ---------------------------------------
    raw_alter_html_list = []
    from parse_util.parse_alter import parse as parse_alter
    altDiv = jibenxinxi.find_all(id='altDiv')
    alterList = parse_alter(altDiv[0]) if altDiv else ''
    jibenxinxi_table_len = len(jibenxinxi.find_all('table'))
    # The last table is the alteration pagination only when the section has
    # more than 6 tables.
    if jibenxinxi_table_len > 6 and jibenxinxi.find_all('table'):
        alter_page = get_page(jibenxinxi.find_all('table')[-1])
    else:
        alter_page = 0
    alter_list_table = ''
    if alter_page > 1:
        for page in range(2, alter_page + 1):
            alter_page_url = 'http://218.95.241.36:9080/QueryAltList.jspx'
            mainId = entId
            alter_page_data = {'pno': page, 'mainId': mainId}
            alter_content = getCompanyInfoUrl_request_util.make_request(alter_page_url, data=alter_page_data,
                                                                        method='get').content
            alter_list_table += alter_content
            alter_tag = BeautifulSoup(alter_content, 'html5lib')
            alterList = alterList + parse_alter(alter_tag)

            raw_alter_html_list.append(alter_content)
    if alterList:
        companyInfo_dict['alterList'] = alterList
    else:
        companyInfo_dict['alterList'] = []

    raw_base_html_dict['alter'] = raw_alter_html_list

    # --- Page 2: filing information (key personnel) -------------------------
    raw_mem_html_list = []
    beian = getCompanyInfoUrl_soup.find_all(id='beian')
    if beian:
        beian = beian[0].find_all('table')
    if beian and len(beian) > 2:
        person_page = get_page(beian[2])
        person_list_table = ''
        if person_page > 1:
            for page in range(2, person_page + 1):
                mem_page_url = 'http://218.95.241.36:9080/QueryMemList.jspx?'
                mainId = entId
                mem_page_data = {'pno': page, 'mainId': mainId}
                mem_content = getCompanyInfoUrl_request_util.make_request(mem_page_url, data=mem_page_data,
                                                                          method='get').content
                person_list_table += mem_content

                raw_mem_html_list.append(mem_content)
        companyInfo_dict['personList'] = index('主要人员信息', str(beian[0]) + str(beian[1]) + person_list_table)
    else:
        companyInfo_dict['personList'] = []

    raw_base_html_dict['person'] = raw_mem_html_list

    # --- Branch (filiation) information -------------------------------------
    raw_child_html_list = []
    t31 = getCompanyInfoUrl_soup.find_all(id='t31')
    childDiv = getCompanyInfoUrl_soup.find_all(id='childDiv')
    childPagination = getCompanyInfoUrl_soup.find('div', attrs={'id': 'childPagination'})
    child_list_table = ''
    if childPagination and childPagination.find_all('a'):
        child_total_page = get_page(childPagination)
        if child_total_page > 1:
            for page in range(2, child_total_page + 1):
                child_page_url = 'http://218.95.241.36:9080/QueryChildList.jspx?'
                mainId = entId
                child_page_data = {'pno': page, 'mainId': mainId}
                child_content = getCompanyInfoUrl_request_util.make_request(child_page_url, data=child_page_data,
                                                                            method='get').content
                child_list_table += child_content

                raw_child_html_list.append(child_content)

    if t31 and childDiv:
        companyInfo_dict['filiationList'] = index('分支机构信息', str(t31[0]) + str(childDiv[0]) + child_list_table)
    else:
        companyInfo_dict['filiationList'] = []

    raw_base_html_dict['filiation'] = raw_child_html_list

    # --- Liquidation ---------------------------------------------------------
    if beian and len(beian) > 2:
        companyInfo_dict['liquidationList'] = index('清算信息', str(beian[-1]))
    else:
        companyInfo_dict['liquidationList'] = []

    # Chattel mortgage registration: not parsed on this site.
    companyInfo_dict['morguaInfoList'] = []

    # Equity pledge registration: not parsed on this site.
    # NOTE(review): this re-assigns 'morguaInfoList' again; the pledge data
    # probably belongs in 'sharesImpawnList' — verify against other provinces.
    companyInfo_dict['morguaInfoList'] = []

    # --- Abnormal-operation list ---------------------------------------------
    excDiv = getCompanyInfoUrl_soup.find_all(id='jingyingyichangminglu')
    if excDiv:
        companyInfo_dict['abnormalOperation'] = index('经营异常信息', str(excDiv[0]))
    else:
        companyInfo_dict['abnormalOperation'] = []

    # --- Spot-check information ----------------------------------------------
    chouchaxinxiDiv = getCompanyInfoUrl_soup.find_all(id='chouchaxinxi')
    chouchaxinxi_table = chouchaxinxiDiv[0].find_all('table') if chouchaxinxiDiv else ''
    if chouchaxinxi_table:
        companyInfo_dict['checkMessage'] = index('抽查检查信息', str(chouchaxinxiDiv[0]))
    else:
        companyInfo_dict['checkMessage'] = []

    companyInfo_dict['province'] = 'qh'

    # --- Annual reports ------------------------------------------------------
    raw_year_html_list = []
    logger.info("开始获取公司年报！")
    nblist_url = rootUrl_str + '/enterprisePublicity.jspx?id=' + entId
    try:
        getCompanyYear_list = getCompanyInfoUrl_request_util.make_request(nblist_url, method='get').content
    except Exception as e:
        logger.error(e)
        raise

    getCompanyYear_soup = BeautifulSoup(getCompanyYear_list, 'html5lib')
    qiyenianbao_tag = getCompanyYear_soup.find_all(id='qiyenianbao')

    if qiyenianbao_tag:
        # One <a> per published annual report; the link text carries the year.
        years_tag_list = qiyenianbao_tag[0].find_all('a')
        for atag in years_tag_list:
            companyYearReport_dict = {}
            raw_year_html_dict = {}

            year = re.findall('\d+', atag.getText())
            year = str(year[0]) if year else ""

            nbdetail = atag.get('href')

            ayearUrl_str = rootUrl_str + nbdetail
            try:
                getCompanyAYearReport = getCompanyInfoUrl_request_util.make_request(ayearUrl_str, method='get').content
            except Exception as e:
                logger.error(e)
                raise
            logger.info("开始解析公司%s年报！", year)
            getCompanyAYearReport_soup = BeautifulSoup(getCompanyAYearReport, 'html5lib')
            # The report's shareholder / equity-change / IP sections hold no
            # data on this site — clicking them sends no request — so only
            # the tables actually present are parsed below.
            qufenkuang_list = getCompanyAYearReport_soup.find_all(id='qufenkuang')

            if qufenkuang_list:
                table_list = qufenkuang_list[0].find_all('table')

                # Report: enterprise basic information
                companyYearReport_dict['baseInfo'] = report_index('企业基本信息', str(table_list[0])) if table_list else {}
                # Report: website / web-shop information
                companyYearReport_dict['website'] = report_index('网站或网店信息', str(table_list[1])) if len(
                    table_list) > 1 else {}
                # Report: shareholder and contribution information
                companyYearReport_dict['investorInformations'] = report_index('股东及出资信息', str(table_list[2])) if len(
                    table_list) > 2 else []

                # Report: enterprise assets status
                companyYearReport_dict['assetsInfo'] = report_index('企业资产状况信息', str(table_list[4])) if len(
                    table_list) > 4 else {}

                # Report: equity-change information
                if len(table_list) > 6:
                    companyYearReport_dict['equityChangeInformations'] = report_index('股权变更信息', str(table_list[6]))
                else:
                    companyYearReport_dict['equityChangeInformations'] = []

                # Report: modification records
                companyYearReport_dict['changeRecords'] = report_index('修改记录', str(qufenkuang_list[1])) if len(
                    qufenkuang_list) > 1 else []
                # Report year
                companyYearReport_dict['year'] = year

            companyYearReport_list.append(companyYearReport_dict)

            # Keep the raw report page source.
            raw_year_html_dict[year] = getCompanyAYearReport
            raw_year_html_list.append(raw_year_html_dict)

        companyInfo_dict['yearReportList'] = companyYearReport_list

    else:
        companyInfo_dict['yearReportList'] = []

    raw_html_dict['html'] = raw_base_html_dict
    raw_html_dict['yearList'] = raw_year_html_list
    raw_html_dict['type'] = '2'
    raw_html_dict['province'] = 'qh'
    raw_html_dict['json'] = ''
    # Bug fix: basicList may legitimately be [] (set above), in which case
    # the original `companyInfo_dict['basicList'][0]` raised IndexError.
    if companyInfo_dict.get('basicList'):
        raw_html_dict['companyName'] = companyInfo_dict['basicList'][0].get('enterpriseName', '')
    else:
        raw_html_dict['companyName'] = ''

    return raw_html_dict, companyInfo_dict


def search2(companyName):
    '''
    Search the Qinghai ('qh') site and return raw html plus parsed info.

    :param companyName: company name or registration number
    :return: a (raw_html_dict, companyInfo_dict) tuple when the company
             exists; None when it does not (or the input is invalid).
    :raises: Exception when the result page cannot be parsed, plus any
             exception re-raised from the download/crack step.
    '''
    province = 'qh'
    MAXTIME = 20  # maximum captcha-cracking attempts
    atime = MAXTIME
    searchCompany = ''

    while atime > 0 and searchCompany == '':
        try:
            searchCompany = downloadImgAnd_kill(companyName, province)
        except Exception as e:
            logger.info(e)
            # Bug fix: print_exc() takes a line limit, not an exception.
            traceback.print_exc()
            raise

        # '' means the cracked captcha was rejected: retry.
        if searchCompany == '':
            atime -= 1
            # Bug fix: the original only logged when MAXTIME == atime, which
            # is never true after the decrement; log every retry instead.
            logger.info("验证码破解失败，重复破解验证码,当前设定次数为:%s ,剩余次数为:%s" % (MAXTIME, atime))
            continue

        # None means the company does not exist or the keyword was invalid.
        elif searchCompany is None:
            break
        else:
            logger.info("验证码破解成功！")

    # Parse the result page.
    try:
        if searchCompany:
            res = getCompanyInfo(searchCompany)
            if res is None:
                raise Exception("错误!")
            elif res and isinstance(res, tuple) and len(res) == 2:
                # Record the original query keyword alongside the raw html.
                res[0]['keyword'] = companyName
                return res
            else:
                raise Exception("错误!")
        else:
            return None
    except Exception:
        traceback.print_exc()
        raise


def search(companyName):
    '''
    Search for a company and return only the parsed company-info dict.

    :param companyName: company name or registration number
    :return: the company-info dict, or None when the input is empty or the
             company does not exist.
    '''
    # Guard clause: reject empty input up front.
    if not companyName:
        return None

    result = search2(companyName)
    # search2 returns (raw_html_dict, companyInfo_dict); keep the latter.
    return None if result is None else result[1]


if __name__ == "__main__":
    companyName = u'西宁三利源酒业经销部'
    # companyName = u'海东市科博瑞科技有限公司'
    # 无
    # companyName = u'青海康普股份有限公司'
    province = 'qh'

    # result = search(companyName)
    result = search2(companyName)

    import json

    print json.dumps(result, indent=4, ensure_ascii=False)
